# Packages required by this analysis.
# Note: "hms" and "lubridate" were previously listed twice; each package
# now appears exactly once so install.packages() does no duplicate work.
packages_to_install <- c(
  "hms", "lubridate", "tidytext", "tm", "wordcloud",
  "igraph", "glue", "networkD3", "plyr", "stringr",
  "ggplot2", "ggeasy", "plotly", "dplyr",
  "magrittr", "tidyverse", "janeaustenr", "widyr"
)
# Install packages
#chooseCRANmirror(graphics=FALSE)
#install.packages(packages_to_install)
library(hms)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:hms':
##
## hms
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(tidytext)
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:lubridate':
##
## %--%, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(glue)
library(networkD3)
library(plyr)
library(stringr)
## Warning: package 'stringr' was built under R version 4.2.3
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(ggeasy)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:igraph':
##
## groups
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:igraph':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(hms)
library(lubridate)
library(magrittr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ igraph::%--%() masks lubridate::%--%()
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::arrange() masks plotly::arrange(), plyr::arrange()
## ✖ tibble::as_data_frame() masks dplyr::as_data_frame(), igraph::as_data_frame()
## ✖ purrr::compact() masks plyr::compact()
## ✖ purrr::compose() masks igraph::compose()
## ✖ dplyr::count() masks plyr::count()
## ✖ tidyr::crossing() masks igraph::crossing()
## ✖ dplyr::desc() masks plyr::desc()
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::failwith() masks plyr::failwith()
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ lubridate::hms() masks hms::hms()
## ✖ dplyr::id() masks plyr::id()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::mutate() masks plotly::mutate(), plyr::mutate()
## ✖ dplyr::rename() masks plotly::rename(), plyr::rename()
## ✖ purrr::set_names() masks magrittr::set_names()
## ✖ purrr::simplify() masks igraph::simplify()
## ✖ dplyr::summarise() masks plotly::summarise(), plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janeaustenr)
library(widyr)
# Load the pre-cleaned Ronaldo tweet data set (CSV) and inspect its schema.
# The str() output below shows ~502k tweets with id, author, text content,
# language, ISO-8601 date string, source client, geo flag, and engagement
# counts (retweets/likes/quotes).
file_path <- "data/Cleaned_ronaldo_tweets.csv"
tweets_df <- read.csv(file_path)
str(tweets_df)
## 'data.frame': 501926 obs. of 10 variables:
## $ tweet_id : num 1.55e+18 1.55e+18 1.55e+18 1.55e+18 1.55e+18 ...
## $ author_id : num 1.41e+18 1.33e+18 1.19e+18 7.27e+17 1.32e+18 ...
## $ content : chr "we fell down to with ronaldo and no big team who wins trophies want him yet you want him at to be our main stri"| __truncated__ "man utd transfer news live frenkie de jong final bid latest cristiano ronaldo admission tielemans interested" "r and ronaldinho make me smile when i saw than on the field stats of course cr but loving football r and ronaldinho" "ronaldo was in the best champions league team ever winning in a row poor pessi was losing and getting sacked fr"| __truncated__ ...
## $ lang : chr "en" "en" "en" "en" ...
## $ date : chr "2022-08-02T07:34:06.000Z" "2022-08-02T07:34:00.000Z" "2022-08-02T07:33:40.000Z" "2022-08-02T07:33:38.000Z" ...
## $ source : chr "Twitter for Android" "Publer.io" "Twitter for iPhone" "Twitter for Android" ...
## $ geo : chr "-1" "-1" "-1" "-1" ...
## $ retweet_count: num 0 0 0 0 0 0 0 0 0 0 ...
## $ like_count : num 0 0 0 0 0 0 0 0 0 0 ...
## $ quote_count : num 0 0 0 0 0 0 0 0 0 0 ...
# Load the positive/negative opinion lexicons; lines starting with ';'
# in the lexicon files are comments and are skipped by scan().
positive <- scan('data/resources/positive-words.txt', what = 'character', comment.char = ';')
negative <- scan('data/resources/negative-words.txt', what = 'character', comment.char = ';')
# Extend the lexicons with domain-specific terms missing from the files.
# tolower() is applied to the combined vectors because score.sentiment()
# lower-cases every tweet before matching, so capitalised entries such as
# 'Congrats', 'Grt', or 'Fight' could never match otherwise (bug fix).
pos.words <- tolower(c(positive, 'upgrade', 'Congrats', 'prizes', 'prize',
                       'thanks', 'thnx', 'Grt', 'gr8', 'plz', 'trending',
                       'recovering', 'brainstorm', 'leader'))
neg.words <- tolower(c(negative, 'wtf', 'wait', 'waiting', 'epicfail',
                       'Fight', 'fighting', 'arrest', 'no', 'not'))
# Score a vector of sentences by simple lexicon matching: the score of a
# sentence is (number of positive-word tokens) - (number of negative-word
# tokens). Matching is case-insensitive because each sentence is
# lower-cased before lookup; the lexicons are assumed to be lower-case.
#
# Args:
#   sentences: character vector of texts to score.
#   pos.words: character vector of positive terms.
#   neg.words: character vector of negative terms.
#   .progress: retained for backward compatibility with the original
#     plyr::laply-based implementation; progress display is no longer
#     supported and the argument is ignored.
#
# Returns:
#   data.frame with columns `score` (one numeric score per sentence) and
#   `text` (the original, unmodified sentences). An empty input yields a
#   0-row data frame.
#
# Changes from the original: require() inside a function is an
# anti-pattern; match() + !is.na() is replaced by the equivalent %in%;
# plyr::laply/stringr::str_split are replaced by base vapply()/strsplit(),
# removing the runtime package dependencies.
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none') {
  scores <- vapply(sentences, function(sentence) {
    # Lower-case so lexicon matching is case-insensitive
    sentence <- tolower(sentence)
    # Tokenise on runs of whitespace
    words <- unlist(strsplit(sentence, '\\s+'))
    # %in% yields TRUE/FALSE per token; sum() treats them as 1/0
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }, numeric(1), USE.NAMES = FALSE)
  data.frame(score = scores, text = sentences)
}
# Chunked-processing parameters for the ~502k tweets.
chunk_size <- 1000 # Adjust the size based on your preference
# Get the number of chunks
# NOTE(review): num_chunks is computed and printed but never used in the
# code shown here — score.sentiment() below is applied to all tweets at
# once. Confirm whether chunked scoring was intended.
num_chunks <- ceiling(nrow(tweets_df) / chunk_size)
print(num_chunks)
## [1] 502
# NOTE(review): plotly is already attached earlier in this script; this
# second library(plotly) call is redundant but harmless.
library(plotly)
cleanText <- tweets_df$content
# Score every tweet: positive-word hits minus negative-word hits per tweet
analysis <- score.sentiment(cleanText, pos.words, neg.words)
# Frequency table of the integer sentiment scores (mode is 0 = neutral)
table(analysis$score)
##
## -15 -13 -10 -9 -8 -7 -6 -5 -4 -3 -2
## 1 3 19 24 59 158 431 1126 3259 9398 28652
## -1 0 1 2 3 4 5 6 7 8 9
## 83272 208448 105790 38064 14860 5216 1964 755 289 84 26
## 10 11 12 13 14 17
## 14 6 4 2 1 1
# plot of sentiment frequencies
# Histogram with binwidth 1 so each integer score gets its own bar
analysis %>%
ggplot(aes(x=score)) +
geom_histogram(binwidth = 1, fill = "lightblue")+
ylab("Frequency") +
xlab("sentiment score") +
ggtitle("Distribution of Sentiment scores of the tweets") +
ggeasy::easy_center_title()
library(plotly)
# Parse the ISO-8601 timestamp strings into POSIXct date-times
tweets_df$date <- lubridate::as_datetime(tweets_df$date)
# Pair each tweet's timestamp with its sentiment score
date_sentiment <- data.frame(
  date = tweets_df$date,
  sentiment = analysis$score
)
# Static time-series view of per-tweet sentiment (ggplot2)
date_sentiment %>%
  ggplot(aes(x = date, y = sentiment)) +
  geom_line() +
  labs(x = "Date", y = "Sentiment Score", title = "Sentiment Over Time") +
  ggeasy::easy_center_title()
# Interactive time-series view of the same data (plotly)
plot_ly(
  date_sentiment,
  x = ~date,
  y = ~sentiment,
  type = "scatter",
  mode = "lines"
) %>%
  layout(
    title = "Sentiment Over Time",
    xaxis = list(title = "Date"),
    yaxis = list(title = "Sentiment Score")
  )